import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
from matplotlib.dates import date2num
import geopandas as gpd
from shapely.geometry import Point,Polygon
import plotly.graph_objects as go
import plotly.express as px
import plotly_express as px
import squarify
# PCA & MDS #
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, cdist
# Load the England population gender dataset and strip metadata rows/columns.
df_gender = pd.read_csv("data on gender on england population.csv")
# The first data row repeats the column headers, so drop it.  .copy() gives us
# an independent frame so later .loc label edits don't raise
# SettingWithCopyWarning.
df_gender = df_gender.iloc[1:, :].copy()
df_gender = df_gender.drop(['CDU_ID', 'GEO_TYP2'], axis=1)
df_gender
# NOTE(review): 'Shropshire' is a GEO_LABEL value, not a GEO_CODE, so this
# comparison is always False — kept only as a notebook-style sanity check.
df_gender['GEO_CODE'] == 'Shropshire'
# Single-district lookups to cross-check codes against labels.  The second
# assignment overwrites the first; only the E06000055 result is kept.
Key = df_gender[df_gender['GEO_LABEL'] == 'Bedfordshire']
Key = df_gender[df_gender['GEO_CODE'] == 'E06000055']
# Mapping from the common county name to the GEO_LABEL string used by the
# CSV / QGIS shape files.  The original cell was a SyntaxError (a dozen
# assignments jammed onto one line, an unbalanced quote before "Kingston upon
# Hull, and identifiers containing spaces/hyphens), so it is rewritten as a
# dict, which also makes the name -> label pairs usable programmatically.
label_key = {
    "Berkshire": "West Berkshire",
    "Bristol": "Bristol, City of",
    "Cheshire": "Cheshire East",
    "Cornwall": "Cornwall, Isles of Scilly",
    "Derbyshire": "Derbyshire Dales",
    "Devon": "Mid Devon",
    "Durham": "County Durham",
    "Herefordshire": "Herefordshire, County of",
    "Kingston upon Hull": "Kingston upon Hull, City of",
    "Westminster": "City of London, Westminster",
    "Stockton-on-Trent": "Stoke-on-Trent",
    "Bedfordshire": "Central Bedfordshire",
    "London": "City of London 001",
}
# Kept for backward compatibility: the one assignment from the original cell
# that was a legal identifier.
Bedfordshire = "Central Bedfordshire"
# Display the single-district lookup from the cell above (notebook-style echo).
Key
# Unassigned list literal — a scratch preview of the GeoCodes list defined
# later; it evaluates and is immediately discarded (no effect).
["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# Stray one-element tuple left over from editing (no effect).
"E09000002",
# Barking and Dagenham - E09000002
# Bath and north east somerset - E06000022
# Bedfordshire - E06000056
# Berkshire - E06000037
# Bexley - E09000004
# Blackburn with Darwen - E06000008
# Bournemouth - E06000028
# Brent - E09000005
# Brighton and Hove - E06000043
# Bristol - E06000023
# Bromley - E09000006
# Buckinghamshire - E10000002
# Cambridgeshire - E07000009
# Camden - E09000007
# Cheshire - E06000049
# Cornwall - E06000052
# Croydon - E09000008
# Cumbria - E10000006
# Darlington - E06000005
# Derbyshire - E07000035
# Derby - E06000015
# Devon - E07000042
# Dorset - E07000049
# Durham - E06000047
# Ealing - E09000009
# East riding of Yorkshire - E06000011
# East sussex - E07000228
# Enfield - E09000010
# Essex - E10000012
# Gloucestershire - E06000025
# Greenwich - E09000011
# Hackney - E09000012
# Halton - E06000006
# Hammersmith and Fulham - E09000013
# Hampshire - E07000085
# Haringey - E09000014
# Harrow - E09000015
# Hartlepool - E06000001
# Havering - E09000016
# Herefordshire - E06000019
# Hertfordshire - E07000099
# Hillingdon - E09000017
# Hounslow - E09000018
# Isle of Wight - E06000046
# Islington - E09000019
# Kensington and Chelsea - E09000020
# Kent - E10000016
# Kingston upon Hull - E06000010
# Lambeth - E09000022
# Lancashire - E07000127
# Leicestershire - E07000134
# Leicester - E06000016
# Lewisham - E09000023
# Lincolnshire - E06000013
# City of London - E09000001
# Luton - E06000032
# Manchester - E08000003
# Medway - E06000035
# Merseyside - E11000002
# Merton - E09000024
# Middlesbrough - E06000002
# Milton Keynes - E06000042
# Newham - E09000025
# Norfolk - E10000020
# North East Lincolnshire - E06000012
# North Somerset - E06000024
# North Yorkshire - E10000023
# Northamptonshire - E10000021
# Northumberland - E06000048
# Nottinghamshire - E10000024
# Nottingham - E06000018
# Oxfordshire - E10000025
# Peterborough - E06000031
# Plymouth - E06000026
# Poole - E06000029
# Portsmouth - E06000044
# Redbridge - E09000026
# Redcar and cleveland - E06000003
# Richmond Upon Thames - E09000027
# Rutland - E06000017
# Shropshire - E06000051
# Somerset - E10000027
# South Gloucestershire - E06000025
# South Yorkshire - E11000003
# Southampton - E06000045
# Southend-on-sea - E06000033
# Southwark - E09000028
# Staffordshire - E10000028
# Stockton-on-tees - E06000004
# Stoke-on-trent - E06000021
# Suffolk - E10000029
# Surrey - E10000030
# Sutton - E09000029
# Swindon - E06000030
# Telford and wrekin - E06000020
# Thurrock - E06000034
# Torbay - E06000027
# Tower Hamlets - E09000030
# Tyne and Wear - E11000004
# Waltham Forest - E09000031
# Wandsworth - E09000032
# Warrington - E06000007
# Warwickshire - E10000031
# West Midlands - E11000005
# West Sussex - E10000032
# West Yorkshire - E11000006
# Westminster - E09000033
# Wiltshire - E06000054
# Worcestershire - E10000034
# York - E06000014
# Kingston upon Thames - E09000021
# Milton Keynes - E06000042
# Collate all codes which correspond to our shape files visualised on QGIS:
# E09000002,E06000022,E06000056,E06000037,E09000004,E06000008,E06000028,E09000005,E06000043,E06000023,E09000006,E10000002,E07000009,E09000007,E06000049,E06000052,E09000008,E10000006,E06000005,E07000035,E06000015,E07000042,E07000049,E06000047,E09000009,E06000011,E07000228,E09000010,E10000012,E06000025,E09000011,E09000012,E06000006,E09000013,E07000085,E09000014,E09000015,E06000001,E09000016,E06000019,E07000099,E09000017,E09000018,E06000046,E09000019,E09000020,E10000016,E06000010,E09000022,E07000127,E07000134,E06000016,E09000023,E06000013,E09000001,E06000032,E08000003,E06000035,E11000002,E09000024,E06000002,E06000042,E09000025,E10000020,E06000012,E06000024,E10000023,E10000021,E06000048,E10000024,E06000018,E10000025,E06000031,E06000026,E06000029,E06000044,E09000026,E06000003,E09000027,E06000017,E06000051,E10000027,E06000025,E11000003,E06000045,E06000033,E09000028,E10000028,E06000004,E06000021,E10000029,E10000030,E09000029,E06000030,E06000020,E06000034,E06000027,E09000030,E11000004,E09000031,E09000032,E06000007,E10000031,E11000005,E10000032,E11000006,E09000033,E06000054,E06000014,E09000021,E10000034
# Master list of GEO codes matching the QGIS shape files.
GeoCodes = ["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# Keep only the districts we have shapes for.  .copy() makes this an
# independent frame so the .loc label edits below don't hit a view of
# df_gender (SettingWithCopyWarning / silent no-op).
filtered_Gender = df_gender[df_gender['GEO_CODE'].isin(GeoCodes)].copy()
print(filtered_Gender)
filtered_Gender.head()
# change one name using key slice
filtered_Gender.loc[filtered_Gender.GEO_CODE == 'E06000037', 'GEO_LABEL'] = "Berkshire"
# Check if change of name works — look up the code we just relabelled.
# (The original checked E06000021, which had not been changed yet.)
Key = filtered_Gender[filtered_Gender['GEO_CODE'] == 'E06000037']
Key
# Change the names of counties to match the shape files on QGIS.
# One code -> label mapping drives all the relabelling instead of a dozen
# copy-pasted .loc assignments.
_qgis_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
    'E06000056': "Bedfordshire",
}
for _code, _label in _qgis_labels.items():
    filtered_Gender.loc[filtered_Gender.GEO_CODE == _code, 'GEO_LABEL'] = _label
# GEO_TYPE carries no information once the filtering is done.
filtered_Gender = filtered_Gender.drop(['GEO_TYPE'], axis=1)
filtered_Gender.head()
# Transform the gender data into a purely numeric ndarray for PCA.
np_array_gender = filtered_Gender.values
# Skip the first three identifier columns and cast to float: the CSV's
# repeated header row forces pandas to read every column as object dtype,
# and PCA requires numeric input.
np_array_gender = np_array_gender[:, 3:].astype(float)
# Building the PCA model — keep only the first 2 principal components.
pca_gender = PCA(n_components=2).fit(np_array_gender)
# Coordinates of each row in the new PC coordinate system (first two PCs).
pca_x_g = pca_gender.transform(np_array_gender)
# Those 2 Principal Components project the data onto a 2D space so we can
# visualise a "compressed" (in terms of variance) version of the DataFrame.
print("The pca_x_g variable has a shape of :", pca_x_g.shape)  # was mislabelled 'pca_x_v'
print(pca_x_g)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs 'Gender' ")
plt.scatter(pca_x_g[:, 0], pca_x_g[:, 1], cmap='viridis')
# Rank the features by the magnitude of their loadings on PC1.
# PC1 = a*column1 + b*column2 + ...; the loadings live in components_[0].
pca_x_1 = pca_gender.components_[0]  # first principal component
sort_array_1 = np.argsort(np.abs(pca_x_1))  # indices sorted low -> high |loading|
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]  # reverse, take top 10
# '3 +' realigns loading indices with filtered_Gender's columns — the first
# three columns were stripped before PCA.
pca_x_1_cols_highest_values = filtered_Gender.columns[3 + sort_array_1][::-1][:10]
# Same thing for the second principal component, more compactly.
pca_x_2_highest_values = np.asarray(pca_gender.components_[1])[np.argsort(np.abs(pca_gender.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_Gender.columns[3 + np.argsort(np.abs(pca_gender.components_[1]))][::-1][:10]
# Print every loading that was kept (the original hard-coded range(3),
# which disagreed with the "10 Highest" heading).
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(len(pca_x_1_highest_values)):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(len(pca_x_2_highest_values)):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
def _violin_by_district(frame, column):
    """Draw a wide violin plot of *column* per GEO_LABEL with rotated ticks."""
    sns.set_style('whitegrid')
    fig, ax = plt.subplots(figsize=(35, 6))
    sns.violinplot(x='GEO_LABEL', y=column, data=frame)
    # Rotate the x-axis labels and remove the plot border on the left.
    plt.xticks(rotation=45, ha='right')
    sns.despine(left=True)

# One violin figure per gender column.
_violin_by_district(filtered_Gender, 'Sex : Males - Unit : Persons')
_violin_by_district(filtered_Gender, 'Sex : Females - Unit : Persons')
# Drop the redundant total column (it is the sum of the male/female columns
# and would dominate the distance computation).
filtered_Gender = filtered_Gender.drop(['Sex : Total\ Sex - Unit : Persons'], axis=1)
# Coerce to numbers before selecting: the CSV's repeated header row leaves
# every column as object dtype, so a plain select_dtypes('number') can come
# up empty.  Non-numeric columns (codes, labels) become all-NaN and are dropped.
gender_num = filtered_Gender.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
gender_num.head()
# Pairwise euclidean distances between rows: passing the same frame twice to
# cdist yields a square distance matrix.
dist_mat = cdist(gender_num, gender_num)
print("The shape of this matrix is", dist_mat.shape)
# dissimilarity='precomputed' means we supply the distance matrix ourselves.
mds_gender = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_gender = mds_gender.fit_transform(dist_mat)
# Visualise the 2D embedding.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_gender[:, 0], mds_x_gender[:, 1], alpha=.3, s=100)
# Label each point with its district name.  annotate()'s 's=' keyword was
# deprecated and removed in modern Matplotlib; pass the text positionally.
for index, name in enumerate(filtered_Gender['GEO_LABEL']):
    plt.annotate(name, xy=(mds_x_gender[index, 0], mds_x_gender[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
# Load the vehicle-accessibility dataset and strip the repeated header row.
df_vehi = pd.read_csv("Vehicle accessibility per household in England Data.csv")
df_vehi = df_vehi.iloc[1:, :]
df_vehi
# Same master list of GEO codes as for the gender data.
GeoCodes = ["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# Filter to the shapes we have; .copy() so later .loc edits don't warn.
filtered_vehi = df_vehi[df_vehi['GEO_CODE'].isin(GeoCodes)].copy()
filtered_vehi.head()
# drop('CDU_ID', 1): the positional 'axis' argument was removed in pandas 2.0.
filtered_vehi = filtered_vehi.drop('CDU_ID', axis=1)
filtered_vehi.corr()
# Original column names in the vehicle-availability dataset (renamed below):
# Car or van availability : Total\ Car or van availability - Unit : Households
# Car or van availability : Total\ Car or van availability - Unit : Cars or vans
# Car or van availability : No cars or vans in household - Unit : Households
# Car or van availability : 1 car or van in household - Unit : Households
# Car or van availability : 2 cars or vans in household - Unit : Households
# Car or van availability : 3 cars or vans in household - Unit : Households
# Car or van availability : 4 or more cars or vans in household - Unit : Households
# Car or van availability : Sum of all cars or vans - Unit : Cars or vans
# Give the census column names human-readable labels.
filtered_vehi.rename(columns = {'Car or van availability : Total\ Car or van availability - Unit : Households':'Total Number of households which have access to a Vehicle ',
'Car or van availability : Total\ Car or van availability - Unit : Cars or vans':'Total number of cars which are privately accessible to citizens(Owned/leased)',
'Car or van availability : No cars or vans in household - Unit : Households':'Total Number of households which do not have access to vehicles',
'Car or van availability : 1 car or van in household - Unit : Households':'Number of households which have access to 1 vehicle',
'Car or van availability : 2 cars or vans in household - Unit : Households':'Number of households which have access to 2 vehicles',
'Car or van availability : 3 cars or vans in household - Unit : Households':'Number of households which have accessibilty to 3 vehicles',
'Car or van availability : 4 or more cars or vans in household - Unit : Households':'Number of households which have access to 4 or more vehicles'}, inplace = True)
filtered_vehi.head()
# Drop the uninformative type columns in one call.
filtered_vehi = filtered_vehi.drop(['GEO_TYPE', 'GEO_TYP2'], axis=1)
filtered_vehi.head()
# Drop the redundant sum column.  drop(..., 1) used the positional 'axis'
# argument, which was removed in pandas 2.0 — use axis=1 explicitly.
filtered_vehi = filtered_vehi.drop('Car or van availability : Sum of all cars or vans - Unit : Cars or vans', axis=1)
filtered_vehi
filtered_vehi.corr()
# Relabel districts to match the QGIS shape files, driven by one
# code -> label mapping instead of repeated .loc assignments.
_vehi_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _vehi_labels.items():
    filtered_vehi.loc[filtered_vehi.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Transform the vehicle data into a purely numeric ndarray for PCA.
np_array_vehi = filtered_vehi.values
# Skip the first four identifier columns and cast to float — the CSV's
# repeated header row makes pandas read every column as object dtype.
np_array_vehi = np_array_vehi[:, 4:].astype(float)
np_array_vehi
# Building the PCA model — keep only the first 2 principal components.
pca_vehi = PCA(n_components=2).fit(np_array_vehi)
# Coordinates of each row in the new PC coordinate system (first two PCs).
pca_x_v = pca_vehi.transform(np_array_vehi)
# Those 2 Principal Components project the data onto a 2D space so we can
# visualise a "compressed" (in terms of variance) version of the DataFrame.
print("The pca_x_v variable has a shape of :", pca_x_v.shape)
print(pca_x_v)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_v[:, 0], pca_x_v[:, 1], cmap='viridis')
# Rank the features by the magnitude of their loadings on PC1
# (loadings live in components_[0]).
pca_x_1 = pca_vehi.components_[0]  # first principal component
sort_array_1 = np.argsort(np.abs(pca_x_1))  # indices sorted low -> high |loading|
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]  # reverse, take top 10
# '4 +' realigns loading indices with filtered_vehi's columns — the first
# four columns were stripped before PCA.
pca_x_1_cols_highest_values = filtered_vehi.columns[4 + sort_array_1][::-1][:10]
# Same thing for the second principal component, more compactly.
pca_x_2_highest_values = np.asarray(pca_vehi.components_[1])[np.argsort(np.abs(pca_vehi.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_vehi.columns[4 + np.argsort(np.abs(pca_vehi.components_[1]))][::-1][:10]
# Print every loading that was kept.  The original hard-coded range(7),
# which raises IndexError when fewer than 7 loadings survive the [:10] slice.
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(len(pca_x_1_highest_values)):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(len(pca_x_2_highest_values)):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
def _violin_by_district(frame, column):
    """Draw a wide violin plot of *column* per GEO_LABEL with rotated ticks."""
    sns.set_style('whitegrid')
    fig, ax = plt.subplots(figsize=(35, 6))
    sns.violinplot(x='GEO_LABEL', y=column, data=frame)
    # Rotate the x-axis labels and remove the plot border on the left.
    plt.xticks(rotation=45, ha='right')
    sns.despine(left=True)

# One violin figure per vehicle-availability column of interest.
_violin_by_district(filtered_vehi, 'Total Number of households which do not have access to vehicles')
_violin_by_district(filtered_vehi, 'Number of households which have access to 1 vehicle')
# Coerce to numbers before selecting: the CSV's repeated header row leaves
# every column as object dtype, so a plain select_dtypes('number') can come
# up empty.  Non-numeric columns (codes, labels) become all-NaN and are dropped.
vehi_num = filtered_vehi.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
vehi_num.head()
# Pairwise euclidean distances between rows (same frame twice -> square matrix).
dist_mat = cdist(vehi_num, vehi_num)
print("The shape of this matrix is", dist_mat.shape)
# dissimilarity='precomputed' means we supply the distance matrix ourselves.
mds_vehi = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_vehi = mds_vehi.fit_transform(dist_mat)
# Visualise the 2D embedding.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_vehi[:, 0], mds_x_vehi[:, 1], alpha=.3, s=100)
# Label each point with its district name.  annotate()'s 's=' keyword was
# deprecated and removed in modern Matplotlib; pass the text positionally.
for index, name in enumerate(filtered_vehi['GEO_LABEL']):
    plt.annotate(name, xy=(mds_x_vehi[index, 0], mds_x_vehi[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
# NOTE(review): this pasted cell was a single-line SyntaxError (many
# statements with no separators) and references columns ('White %',
# 'Black %', 'Asian %', 'Mixed %', 'inEmployment') that do not exist in
# filtered_vehi — it appears to have been copied from a different notebook.
# Disabled until the intended ethnicity/employment data is available:
# plt.figure(num=2, figsize=(16, 10))
# plt.subplot(2, 2, 1)
# ax1 = sns.regplot(filtered_vehi['White %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha': 0.3}, line_kws={'color': 'tomato'})
# ax1.set(title='Regression : Order 1')
# plt.subplot(2, 2, 2)
# ax2 = sns.regplot(filtered_vehi['Black %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha': 0.3}, line_kws={'color': 'tomato'})
# ax2.set(title='Regression : Order 2')
# plt.subplot(2, 2, 3)
# ax3 = sns.regplot(filtered_vehi['Asian %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha': 0.3}, line_kws={'color': 'tomato'})
# ax3.set(title='Regression : Order 3')
# plt.subplot(2, 2, 4)
# ax4 = sns.regplot(filtered_vehi['Mixed %'], filtered_vehi['inEmployment'], order=1, scatter_kws={'alpha': 0.3}, line_kws={'color': 'tomato'})
# ax4.set(title='Regression : Order 4')
# plt.tight_layout()
# Load the religion dataset and strip the repeated header row.
df_reli = pd.read_csv("data on religion of England population .csv")
df_reli = df_reli.iloc[1:, :]
df_reli
# Same master list of GEO codes as for the other datasets.
GeoCodes = ["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# Filter to the shapes we have; .copy() so later .loc edits don't warn.
filtered_reli = df_reli[df_reli['GEO_CODE'].isin(GeoCodes)].copy()
filtered_reli.head()
# Relabel districts to match the QGIS shape files, driven by one
# code -> label mapping instead of repeated .loc assignments.
_reli_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _reli_labels.items():
    filtered_reli.loc[filtered_reli.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Drop the uninformative type columns.
filtered_reli = filtered_reli.drop(['GEO_TYP2'], axis=1)
filtered_reli = filtered_reli.drop(['GEO_TYPE'], axis=1)
filtered_reli.head()
# Transform the religion data into a purely numeric ndarray for PCA.
np_array_reli = filtered_reli.values
# Skip the first five identifier columns and cast to float — the CSV's
# repeated header row makes pandas read every column as object dtype.
np_array_reli = np_array_reli[:, 5:].astype(float)
np_array_reli
# Building the PCA model — keep only the first 2 principal components.
pca_reli = PCA(n_components=2).fit(np_array_reli)
# Coordinates of each row in the new PC coordinate system (first two PCs).
pca_x_r = pca_reli.transform(np_array_reli)
# Those 2 Principal Components project the data onto a 2D space so we can
# visualise a "compressed" (in terms of variance) version of the DataFrame.
print("The pca_x_r variable has a shape of :", pca_x_r.shape)
print(pca_x_r)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_r[:, 0], pca_x_r[:, 1], cmap='viridis')
# Rank the features by the magnitude of their loadings on PC1
# (loadings live in components_[0]).
pca_x_1 = pca_reli.components_[0]  # first principal component
sort_array_1 = np.argsort(np.abs(pca_x_1))  # indices sorted low -> high |loading|
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]  # reverse, take top 10
# '5 +' realigns loading indices with filtered_reli's columns — the first
# five columns were stripped before PCA.
pca_x_1_cols_highest_values = filtered_reli.columns[5 + sort_array_1][::-1][:10]
# Same thing for the second principal component, more compactly.
pca_x_2_highest_values = np.asarray(pca_reli.components_[1])[np.argsort(np.abs(pca_reli.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_reli.columns[5 + np.argsort(np.abs(pca_reli.components_[1]))][::-1][:10]
# Print every loading that was kept.  The original hard-coded range(7),
# which raises IndexError when fewer than 7 loadings survive the [:10] slice.
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(len(pca_x_1_highest_values)):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(len(pca_x_2_highest_values)):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
def _violin_by_district(frame, column):
    """Draw a wide violin plot of *column* per GEO_LABEL with rotated ticks."""
    sns.set_style('whitegrid')
    fig, ax = plt.subplots(figsize=(35, 6))
    sns.violinplot(x='GEO_LABEL', y=column, data=frame)
    # Rotate the x-axis labels and remove the plot border on the left.
    plt.xticks(rotation=45, ha='right')
    sns.despine(left=True)

# One violin figure per religion column, in the original order.
for _reli_col in [
    'Religion [E][S][W] : Christian - Unit : Persons',
    'Religion [E][S][W] : Muslim - Unit : Persons',
    'Religion [E][S][W] : Hindu - Unit : Persons',
    'Religion [E][S][W] : Sikh - Unit : Persons',
    'Religion [E][S][W] : Total\ Religion - Unit : Persons',
    'Religion [E][S][W] : No religion - Unit : Persons',
]:
    _violin_by_district(filtered_reli, _reli_col)
# drop('CDU_ID', 1): the positional 'axis' argument was removed in pandas 2.0.
filtered_reli = filtered_reli.drop('CDU_ID', axis=1)
# Coerce to numbers before selecting: the CSV's repeated header row leaves
# every column as object dtype, so a plain select_dtypes('number') can come
# up empty.  Non-numeric columns (codes, labels) become all-NaN and are dropped.
reli_num = filtered_reli.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
reli_num.head()
# Pairwise euclidean distances between rows (same frame twice -> square matrix).
dist_mat_reli = cdist(reli_num, reli_num)
print("The shape of this matrix is", dist_mat_reli.shape)
# dissimilarity='precomputed' means we supply the distance matrix ourselves.
mds_reli = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_reli = mds_reli.fit_transform(dist_mat_reli)
# Visualise the 2D embedding.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_reli[:, 0], mds_x_reli[:, 1], alpha=.3, s=100)
# Label each point with its district name.  annotate()'s 's=' keyword was
# deprecated and removed in modern Matplotlib; pass the text positionally.
for index, name in enumerate(filtered_reli['GEO_LABEL']):
    plt.annotate(name, xy=(mds_x_reli[index, 0], mds_x_reli[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
# Load the health dataset; row 0 is a units/description row, not data.
df_health = pd.read_csv("Data on population health in England .csv")
df_health = df_health.iloc[1:, :]
df_health
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# FIX: .copy() detaches the filtered slice from df_health so the .loc label
# edits below modify real data instead of a view (avoids
# SettingWithCopyWarning and possibly-lost writes).
filtered_health = df_health[df_health['GEO_CODE'].isin(GeoCodes)].copy()
filtered_health.head()
# Drop the grand-total column -- presumably the sum of the category columns
# and therefore redundant for PCA/MDS (TODO confirm against the CSV).
filtered_health = filtered_health.drop(['General health : Total\ General health - Unit : Persons'], axis=1)
filtered_health
# Replace the verbose census labels with short, familiar district names.
_short_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _short_labels.items():
    filtered_health.loc[filtered_health.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Drop both geography-type metadata columns in one call.
filtered_health = filtered_health.drop(['GEO_TYPE', 'GEO_TYP2'], axis=1)
filtered_health.head()
# Raw ndarray of the frame (object dtype: the label columns are still in).
np_array_health = filtered_health.values
np_array_health
# Keep only the data columns.
# NOTE(review): at this point the frame starts with CDU_ID, GEO_CODE,
# GEO_LABEL, so [:, 5:] also skips two further columns -- confirm the CSV
# layout actually has five leading non-data columns.
np_array_health = np_array_health[:, 5:]
# Fit a 2-component PCA: project the data onto the two directions of
# greatest variance so it can be drawn in 2-D.
pca_health = PCA(n_components=2).fit(np_array_health)
pca_x_h = pca_health.transform(np_array_health)  # (n_rows, 2) coordinates
print("The pca_x_h variable has a shape of :", pca_x_h.shape)
print(pca_x_h)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_h[:, 0], pca_x_h[:, 1], cmap='viridis')
# Loadings: PC1 = a*col1 + b*col2 + ...; the coefficients live in
# components_[0].  Rank columns by |loading|, keep the ten largest.
pca_x_1 = pca_health.components_[0]
sort_array_1 = np.argsort(np.abs(pca_x_1))  # ascending by magnitude
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]
# The '5+' realigns loading indices with the dataframe, whose first five
# columns were excluded from the PCA input above.
pca_x_1_cols_highest_values = filtered_health.columns[5 + sort_array_1][::-1][:10]
# Same for the second component, in compact form.
pca_x_2_highest_values = np.asarray(pca_health.components_[1])[np.argsort(np.abs(pca_health.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_health.columns[5 + np.argsort(np.abs(pca_health.components_[1]))][::-1][:10]
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
# FIX: the header promises ten loadings but the original loop printed five;
# zip() walks names and values together and adapts if fewer columns exist.
for col, val in zip(pca_x_1_cols_highest_values, pca_x_1_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for col, val in zip(pca_x_2_cols_highest_values, pca_x_2_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
# Drop the row-ID column so it cannot dominate the euclidean distances.
filtered_health = filtered_health.drop(['CDU_ID'], axis=1)
# Numeric columns only: MDS needs a purely numeric matrix.
health_num = filtered_health.select_dtypes(include='number')
health_num.head()
# Square matrix of pairwise euclidean distances (same frame given twice).
dist_mat_health = cdist(health_num, health_num)
print("The shape of this matrix is", dist_mat_health.shape)
# dissimilarity='precomputed': we supply the distance matrix ourselves.
mds_health = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_health = mds_health.fit_transform(dist_mat_health)
# Visualise the 2-D embedding, labelling every point with its district name.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_health[:, 0], mds_x_health[:, 1], alpha=.3, s=100)
for index, name in enumerate(filtered_health['GEO_LABEL']):
    # BUG FIX: matplotlib 3.3+ removed annotate's `s=` keyword alias;
    # pass the text positionally.
    plt.annotate(name, xy=(mds_x_health[index, 0], mds_x_health[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
The dimensions of deprivation used to classify households are indicators based on four selected household characteristics. A household is deprived in a dimension if it meets one or more of the following conditions:
# Load the deprivation dataset; row 0 is a units/description row, not data.
df_deprivation = pd.read_csv("Deprevation in England Population.csv")
df_deprivation = df_deprivation.iloc[1:, :]
df_deprivation.head()
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# FIX: .copy() detaches the filtered slice so the rename/label edits that
# follow act on an independent frame (avoids SettingWithCopyWarning).
filtered_dep = df_deprivation[df_deprivation['GEO_CODE'].isin(GeoCodes)].copy()
filtered_dep.head()
Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households
Deprivation; classification of household [E][S][W] : Household is not deprived in any dimension - Unit : Households
Deprivation; classification of household [E][S][W] : Household is deprived in 1 dimension - Unit : Households
Deprivation; classification of household [E][S][W] : Household is deprived in 2 dimensions - Unit : Households
Deprivation; classification of household [E][S][W] : Household is deprived in 3 dimensions - Unit : Households
Deprivation; classification of household [E][S][W] : Household is deprived in 4 dimensions - Unit : Households
# Shorten the unwieldy census column names to something readable.
# FIX: the original used inplace=True on a filtered frame, which triggers
# SettingWithCopyWarning and may silently fail to apply -- reassigning the
# rename result is the safe form.
filtered_dep = filtered_dep.rename(columns={
    'Deprivation; classification of household [E][S][W] : Total\ Classification of household deprivation - Unit : Households': 'Total households examined using deprivation indexes',
    'Deprivation; classification of household [E][S][W] : Household is not deprived in any dimension - Unit : Households': 'Number of households which are not deprived in any of the scrutinised dimensions',
    'Deprivation; classification of household [E][S][W] : Household is deprived in 1 dimension - Unit : Households': 'Number of households which are deprived by 1 of the scrutinised dimensions',
    'Deprivation; classification of household [E][S][W] : Household is deprived in 2 dimensions - Unit : Households': 'Number of households which are deprived by 2 of the scrutinised dimensions',
    'Deprivation; classification of household [E][S][W] : Household is deprived in 3 dimensions - Unit : Households': 'Number of households which are deprived by 3 of the scrutinised dimensions',
    'Deprivation; classification of household [E][S][W] : Household is deprived in 4 dimensions - Unit : Households': 'Number of households which are deprived by 4 of the scrutinised dimensions'})
filtered_dep
# Replace the verbose census labels with short, familiar district names.
_short_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _short_labels.items():
    filtered_dep.loc[filtered_dep.GEO_CODE == _code, 'GEO_LABEL'] = _label
filtered_dep
# Drop the metadata columns in one call (same effect as three separate drops).
filtered_dep = filtered_dep.drop(['GEO_TYPE', 'GEO_TYP2', 'CDU_ID'], axis=1)
filtered_dep.head()
# Raw ndarray of the frame (object dtype: the label columns are still in).
np_array_dep = filtered_dep.values
np_array_dep
# Keep only the data columns.
# NOTE(review): only GEO_CODE and GEO_LABEL remain in front here, so [:, 4:]
# appears to skip two data columns as well -- confirm against the CSV layout.
np_array_dep = np_array_dep[:, 4:]
np_array_dep
# Fit a 2-component PCA: project the data onto the two directions of
# greatest variance so it can be drawn in 2-D.
pca_dep = PCA(n_components=2).fit(np_array_dep)
pca_x_d = pca_dep.transform(np_array_dep)  # (n_rows, 2) coordinates
# BUG FIX: the message previously said 'pca_x_h' (copy-paste from the health
# section); the variable reported here is pca_x_d.
print("The pca_x_d variable has a shape of :", pca_x_d.shape)
print(pca_x_d)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_d[:, 0], pca_x_d[:, 1], cmap='viridis')
# Loadings: PC1 = a*col1 + b*col2 + ...; coefficients live in components_[0].
pca_x_1 = pca_dep.components_[0]
sort_array_1 = np.argsort(np.abs(pca_x_1))  # ascending by magnitude
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]
# The '4+' realigns loading indices with the dataframe, whose first four
# positions were excluded from the PCA input above.
pca_x_1_cols_highest_values = filtered_dep.columns[4 + sort_array_1][::-1][:10]
# Same for the second component, in compact form.
pca_x_2_highest_values = np.asarray(pca_dep.components_[1])[np.argsort(np.abs(pca_dep.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_dep.columns[4 + np.argsort(np.abs(pca_dep.components_[1]))][::-1][:10]
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
# FIX: zip() adapts to however many loadings survive the [:10] cap; the
# original hard-coded range(6) risked an IndexError with fewer columns.
for col, val in zip(pca_x_1_cols_highest_values, pca_x_1_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for col, val in zip(pca_x_2_cols_highest_values, pca_x_2_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
# Numeric columns only: MDS needs a purely numeric matrix.
dep_num = filtered_dep.select_dtypes(include='number')
dep_num.head()
# Square matrix of pairwise euclidean distances (same frame given twice).
dist_mat_dep = cdist(dep_num, dep_num)
print("The shape of this matrix is", dist_mat_dep.shape)
# dissimilarity='precomputed': we supply the distance matrix ourselves.
mds_dep = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_dep = mds_dep.fit_transform(dist_mat_dep)
# Visualise the 2-D embedding, labelling every point with its district name.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_dep[:, 0], mds_x_dep[:, 1], alpha=.3, s=100)
for index, name in enumerate(filtered_dep['GEO_LABEL']):
    # BUG FIX: matplotlib 3.3+ removed annotate's `s=` keyword alias;
    # pass the text positionally.
    plt.annotate(name, xy=(mds_x_dep[index, 0], mds_x_dep[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
Description: A person is a provider of unpaid care if they look after or give help or support to family members, friends, neighbours or others because of long-term physical or mental ill-health or disability, or problems related to old age. This does not include any activities as part of paid employment. No distinction is made about whether any care that a person provides is within their own household or outside the household, so no explicit link can be made about whether the care provided is for a person within the household who has poor general health or a long-term health problem or disability.
# Load the unpaid-carers dataset; row 0 is a units/description row, not data.
df_uncar = pd.read_csv("Data on Unpaid Carers.csv")
df_uncar = df_uncar.iloc[1:, :]
df_uncar.head()
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# Restrict to the districts of interest, then drop the row-ID column.
filtered_uncar = df_uncar[df_uncar['GEO_CODE'].isin(GeoCodes)]
filtered_uncar.head()
filtered_uncar = filtered_uncar.drop(['CDU_ID'], axis=1)
# Snapshot the values NOW -- deliberately before the label fixes and column
# drops below, matching the original execution order.
np_array_uncar = filtered_uncar.values
np_array_uncar
# Replace the verbose census labels with short, familiar district names.
_short_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _short_labels.items():
    filtered_uncar.loc[filtered_uncar.GEO_CODE == _code, 'GEO_LABEL'] = _label
filtered_uncar = filtered_uncar.drop(['GEO_TYP2'], axis=1)
filtered_uncar = filtered_uncar.drop(['GEO_TYPE'], axis=1)
filtered_uncar.head()
# At snapshot time the leading columns were presumably GEO_CODE, GEO_LABEL,
# GEO_TYPE, GEO_TYP2 (TODO confirm), so [:, 4:] keeps the census data only.
np_array_uncar = np_array_uncar[:, 4:]
# Fit a 2-component PCA: project the carers data onto the two directions of
# greatest variance so it can be drawn in 2-D.
pca_uncar = PCA(n_components=2).fit(np_array_uncar)
pca_x_uncar = pca_uncar.transform(np_array_uncar)  # (n_rows, 2) coordinates
# BUG FIX: the message previously said 'pca_x_h' (copy-paste from the health
# section); the variable reported here is pca_x_uncar.
print("The pca_x_uncar variable has a shape of :", pca_x_uncar.shape)
print(pca_x_uncar)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_uncar[:, 0], pca_x_uncar[:, 1], cmap='viridis')
# Loadings: PC1 = a*col1 + b*col2 + ...; coefficients live in components_[0].
pca_x_1 = pca_uncar.components_[0]
sort_array_1 = np.argsort(np.abs(pca_x_1))  # ascending by magnitude
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]
# BUG FIX: the PCA input was snapshotted BEFORE GEO_TYPE/GEO_TYP2 were
# dropped (four leading meta columns), but filtered_uncar now has only two
# (GEO_CODE, GEO_LABEL).  The original '4+' therefore named columns two
# places to the right of the true ones; '2+' realigns loading indices with
# the current dataframe.
pca_x_1_cols_highest_values = filtered_uncar.columns[2 + sort_array_1][::-1][:10]
# Same for the second component, in compact form.
pca_x_2_highest_values = np.asarray(pca_uncar.components_[1])[np.argsort(np.abs(pca_uncar.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_uncar.columns[2 + np.argsort(np.abs(pca_uncar.components_[1]))][::-1][:10]
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
# FIX: zip() matches the "10 Highest" header and adapts to short column sets
# (the original printed only the first 5).
for col, val in zip(pca_x_1_cols_highest_values, pca_x_1_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for col, val in zip(pca_x_2_cols_highest_values, pca_x_2_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
# ---- MDS on the same data ----
uncar_num = filtered_uncar.select_dtypes(include='number')  # numeric columns only
uncar_num.head()
# Square matrix of pairwise euclidean distances (same frame given twice).
dist_mat_uncar = cdist(uncar_num, uncar_num)
print("The shape of this matrix is", dist_mat_uncar.shape)
# dissimilarity='precomputed': we supply the distance matrix ourselves.
mds_uncar = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_uncar = mds_uncar.fit_transform(dist_mat_uncar)
# Visualise the 2-D embedding, labelling every point with its district name.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_uncar[:, 0], mds_x_uncar[:, 1], alpha=.3, s=100)
for index, name in enumerate(filtered_uncar['GEO_LABEL']):
    # BUG FIX: matplotlib 3.3+ removed annotate's `s=` keyword alias;
    # pass the text positionally.
    plt.annotate(name, xy=(mds_x_uncar[index, 0], mds_x_uncar[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
Economic activity relates to whether or not a person who was aged 16 and over was working or looking for work in the week before census. Rather than a simple indicator of whether or not someone was currently in employment, it provides a measure of whether or not a person was an active participant in the labour market.
A person's economic activity is derived from their 'activity last week'. This is an indicator of their status or availability for employment - whether employed, unemployed, or their status if not employed and not seeking employment. Additional information included in the economic activity classification is also derived from information about the number of hours a person works and their type of employment - whether employed or self-employed.
The census concept of economic activity is compatible with the standard for economic status defined by the <a href="http://www.ilo.org/global/statistics-and-databases/classifications/lang--en/index.htm" target="_blank">International Labour Organisation (ILO)</a>. It is one of a number of definitions used internationally to produce accurate and comparable statistics on employment.
# Load the economic-activity dataset; row 0 is a units/description row.
df_ea = pd.read_csv("Economic activity dataset in England .csv")
df_ea = df_ea.iloc[1:, :]
df_ea.head()
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# FIX: .copy() detaches the filtered slice so the .loc label edits below
# modify real data instead of a view (avoids SettingWithCopyWarning).
filtered_ea = df_ea[df_ea['GEO_CODE'].isin(GeoCodes)].copy()
filtered_ea.head()
# Replace the verbose census labels with short, familiar district names.
_short_labels = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _short_labels.items():
    filtered_ea.loc[filtered_ea.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Drop the metadata columns in one call (same effect as three separate drops).
filtered_ea = filtered_ea.drop(['GEO_TYP2', 'GEO_TYPE', 'CDU_ID'], axis=1)
# Raw ndarray of the frame (object dtype: label columns still included).
np_array_ea = filtered_ea.values
np_array_ea
# Keep only the data columns.
# NOTE(review): after the drops above only GEO_CODE and GEO_LABEL remain in
# front, so [:, 4:] appears to skip two data columns as well -- confirm
# against the CSV layout.
np_array_ea = np_array_ea[:, 4:]
# Fit a 2-component PCA: project the data onto the two directions of
# greatest variance so it can be drawn in 2-D.
pca_ea = PCA(n_components=2).fit(np_array_ea)
pca_x_ea = pca_ea.transform(np_array_ea)  # (n_rows, 2) coordinates
print("The pca_x_ea variable has a shape of :", pca_x_ea.shape)
print(pca_x_ea)
plt.subplot(1, 2, 1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
plt.scatter(pca_x_ea[:, 0], pca_x_ea[:, 1], cmap='viridis')
# Loadings: PC1 = a*col1 + b*col2 + ...; coefficients live in components_[0].
pca_x_1 = pca_ea.components_[0]
sort_array_1 = np.argsort(np.abs(pca_x_1))  # ascending by magnitude
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10]
# The '4+' realigns loading indices with the dataframe, whose first four
# positions were excluded from the PCA input above.
pca_x_1_cols_highest_values = filtered_ea.columns[4 + sort_array_1][::-1][:10]
# Same for the second component, in compact form.
pca_x_2_highest_values = np.asarray(pca_ea.components_[1])[np.argsort(np.abs(pca_ea.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_ea.columns[4 + np.argsort(np.abs(pca_ea.components_[1]))][::-1][:10]
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
# FIX: zip() matches the "10 Highest" header and adapts to short column sets
# (the original printed only the first 5).
for col, val in zip(pca_x_1_cols_highest_values, pca_x_1_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for col, val in zip(pca_x_2_cols_highest_values, pca_x_2_highest_values):
    print('The Column "{}" has a loading of : {}'.format(col, val))
# Numeric columns only: MDS needs a purely numeric matrix.
ea_num = filtered_ea.select_dtypes(include='number')
ea_num.head()
# Square matrix of pairwise euclidean distances (same frame given twice).
dist_mat_ea = cdist(ea_num, ea_num)
print("The shape of this matrix is", dist_mat_ea.shape)
# dissimilarity='precomputed': we supply the distance matrix ourselves.
mds_ea = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_ea = mds_ea.fit_transform(dist_mat_ea)
# Visualise the 2-D embedding, labelling every point with its district name.
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_ea[:, 0], mds_x_ea[:, 1], alpha=.3, s=100)
for index, name in enumerate(filtered_ea['GEO_LABEL']):
    # BUG FIX: matplotlib 3.3+ removed annotate's `s=` keyword alias;
    # pass the text positionally.
    plt.annotate(name, xy=(mds_x_ea[index, 0], mds_x_ea[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
The main population base for statistics from the 2011 Census is the usual resident population as at census day, 27 March 2011. Although the population base for enumeration included non-UK born short-term residents, this population is analysed separately and is not included in the main outputs from the 2011 Census.
All statistics, unless specified, are produced using only usual residents of the UK.
For 2011 Census purposes, a usual resident of the UK is anyone who, on census day, was in the UK and had stayed or intended to stay in the UK for a period of 12 months or more, or had a permanent UK address and was outside the UK and intended to be outside the UK for less than 12 months.
For information about the main population base for statistics, how other population sub-groups are counted, and all variable definitions, see information about <a href="http://web.ons.gov.uk/ons/guide-method/census/2011/census-data/2011-census-user-guide/information-by-variable/index.html" target="_blank">variables and classifications</a>.
# Load the population dataset; row 0 is a units/description row, not data.
df_pop = pd.read_csv("Data on population in England.csv")
df_pop = df_pop.iloc[1:, :]
df_pop
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
# FIX: .copy() detaches the filtered slice so the NaN fills and label edits
# below act on an independent frame (avoids SettingWithCopyWarning).
filtered_pop = df_pop[df_pop['GEO_CODE'].isin(GeoCodes)].copy()
filtered_pop.head()
Check for NaN values, which are causing an error in our PCA calculation.
# Is anything missing at all?
filtered_pop.isna().values.any()
# Total count of missing cells across the whole frame.
filtered_pop.isna().sum().sum()
# Per-column breakdown of the missing values.
count_nan_in_df = filtered_pop.isna().sum()
print(count_nan_in_df)
# Show the offending rows: every row carrying at least one NaN.
df133 = filtered_pop[filtered_pop.isna().any(axis=1)]
df133
We can fix this by calculating the population density (Population / Area) for the two affected rows.
# Population density (persons per unit area) for the two rows whose density
# cell is missing: density = total population / area.
_cornwall_population, _cornwall_area = 534476.0, 356256.0
_westminster_population, _westminster_area = 226771.0, 2438.0
Population_Density_Persons_Cornwall = _cornwall_population / _cornwall_area
Population_Density_Persons_Westminster = _westminster_population / _westminster_area
print(Population_Density_Persons_Cornwall)
print(Population_Density_Persons_Westminster)
We know that:
the "City of London, Westminster" row is at positional index 110
&
the "Cornwall, Isles of Scilly" row is at positional index 109
Therefore we use iloc to fill in the missing density values:
# Fill the missing density cells of the two affected rows.
# BUG FIX: the fill values were previously passed as *strings*
# ('1.5002582412647085' / '93.01517637407711'), which silently turns the
# affected cells into str objects (object dtype) and leaves downstream numeric
# code relying on implicit coercion. Use the float variables computed above.
filtered_pop.update(filtered_pop.iloc[[109]].fillna(Population_Density_Persons_Cornwall))
filtered_pop.update(filtered_pop.iloc[[110]].fillna(Population_Density_Persons_Westminster))
# Inspect the Cornwall row to confirm the NaN is gone.
filtered_pop.iloc[109]
Done! Now let's continue.
filtered_pop
# Normalise a handful of GEO_LABEL values to short, human-friendly names
# (keys are 2011 census GEO_CODEs).
_pop_label_fixes = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    # NOTE(review): E06000021 is Stoke-on-Trent; "Stockton-on-Trent" looks
    # like a typo. It is corrected to "Stoke-on-Trent" before the final export.
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _pop_label_fixes.items():
    filtered_pop.loc[filtered_pop.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Drop the metadata columns that carry no analytical signal.
filtered_pop = filtered_pop.drop(['GEO_TYP2', 'GEO_TYPE'], axis=1)
filtered_pop.head()
filtered_pop = filtered_pop.drop(['CDU_ID'], axis=1)
# Transform the data into a numpy ndarray
# NOTE(review): .values on the mixed frame (GEO_CODE/GEO_LABEL are strings)
# yields an object-dtype array; sklearn coerces the numeric slice below.
np_array_pop = filtered_pop.values
np_array_pop
# Removing the first four columns
# assumes the first four columns are non-numeric identifier columns
# (GEO_CODE, GEO_LABEL, ...) — TODO confirm against the CSV header; the
# `columns[4 + ...]` index arithmetic further down relies on this offset.
np_array_pop = np_array_pop[:, 4:]
# Building the PCA model
pca_pop = PCA(n_components=2).fit(np_array_pop) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_pop = pca_pop.transform(np_array_pop) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame
# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_pop variable has a shape of :", pca_x_pop.shape)
print(pca_x_pop)
# Scatter the rows in the space of the first two principal components.
# Left panel of a 1x2 grid (the right panel is never filled here).
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
# NOTE(review): cmap= has no effect without a c= array — all points are drawn
# in the default colour; either supply c= or drop cmap=.
plt.scatter(pca_x_pop[:,0], pca_x_pop[:,1],cmap='viridis')
# Ordering the features by their coefficients in the equation of the first
# Principal Component. PC1 = a*column1 + b*column2 + ...; the coefficients
# (loadings) are stored in pca_pop.components_[0].
pca_x_1 = pca_pop.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # indices sorted by |loading|, ascending
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # reverse to descending, keep the top 10 (signs preserved)
pca_x_1_cols_highest_values = filtered_pop.columns[4+sort_array_1][::-1][:10] # matching column names
# The '4+sort_array_1' re-aligns indices: PCA ran on np_array_pop[:, 4:] (the
# frame minus its first four columns), so array index i maps to frame column i+4.
# Same thing for the second component ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_pop.components_[1])[np.argsort(np.abs(pca_pop.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_pop.columns[4+np.argsort(np.abs(pca_pop.components_[1]))][::-1][:10]
# Printing the result.
# BUG FIX: these loops previously iterated range(7) although the header
# promises the "10 Highest Loadings", the slices above take [:10], and the
# parallel ethnicity section prints 10 — print all 10 for consistency.
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(10):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(10):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
pop_num = filtered_pop.select_dtypes(include='number') # selects numeric columns only
pop_num.head()
# Let's compute the distance matrix of pairwise euclidean distances.
# cdist computes distances between rows of the first and second argument;
# passing the same frame twice yields the square pairwise distance matrix.
dist_mat_pop = cdist(pop_num, pop_num)
print("The shape of this matrix is", dist_mat_pop.shape)
# dissimilarity='precomputed' tells MDS we supply the distance matrix ourselves
# (otherwise MDS would compute one internally).
mds_pop = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_pop = mds_pop.fit_transform(dist_mat_pop)
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_pop[:, 0], mds_x_pop[:, 1], alpha = .3, s=100)
# Label every district with its name at its MDS coordinates.
# BUG FIX: the label was previously passed as the keyword 's=', which was
# deprecated in Matplotlib 3.3 and removed in 3.5 (renamed to 'text'); pass
# it positionally so the call works on every Matplotlib version.
for index, name in enumerate(filtered_pop['GEO_LABEL']):
    plt.annotate(name, xy=(mds_x_pop[index, 0], mds_x_pop[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
# Load the 2011 census ethnicity table; row 0 of the CSV is a metadata/units
# row, so drop it before filtering.
df_ethnic=pd.read_csv("Data on ethnicity in England Geo.csv")
df_ethnic = df_ethnic.iloc[1: , :]
df_ethnic
# The same 112 target geography codes used for the population dataset above.
GeoCodes=["E09000002","E06000022","E06000055","E06000037","E09000004","E06000008","E06000028","E09000005","E06000043","E06000023","E09000006","E10000002","E10000003","E09000007","E06000049","E41000052","E09000008","E10000006","E06000005","E07000035","E06000015","E07000042","E10000009","E06000047","E09000009","E06000011","E10000011","E09000010","E10000012","E10000013","E09000011","E09000012","E06000006","E09000013","E10000014","E09000014","E09000015","E06000001","E09000016","E06000019","E10000015","E09000017","E09000018","E06000046","E09000019","E09000020","E10000016","E06000010","E09000021","E09000022","E10000017","E06000016","E10000018","E09000023","E10000019","E02000001","E06000032","E08000003","E06000035","E11000002","E09000024","E06000002","E06000042","E09000025","E10000020","E06000012","E06000013","E06000024","E10000023","E10000021","E06000048","E06000018","E10000024","E10000025","E06000031","E06000026","E06000029","E06000044","E09000026","E06000003","E09000027","E06000017","E06000051","E10000027","E06000025","E11000003","E06000045","E06000033","E09000028","E10000028","E06000021","E06000004","E10000029","E10000030","E09000029","E06000030","E06000020","E06000034","E06000027","E09000030","E11000004","E09000031","E09000032","E06000007","E10000031","E11000005","E10000032","E11000006","E41000324","E06000054","E10000034","E06000014"]
filtered_ethnic = df_ethnic[df_ethnic['GEO_CODE'].isin(GeoCodes)]
print(filtered_ethnic.head())
filtered_ethnic.columns
# Shorten the column names by stripping the common 'Ethnic group [E][S][W] : '
# prefix. The '\ ' sequences are literal backslash-space characters present in
# the CSV headers (not escapes), so they must be preserved exactly.
# NOTE(review): the Arab column alone is renamed to a free-form description
# ('Number of people classified as Arab ethnic group') instead of following
# the stripped-prefix pattern — confirm this asymmetry is intentional.
filtered_ethnic.rename(columns = {'Ethnic group [E][S][W] : Total\ Ethnic group - Unit : Persons':'Total\ Ethnic group - Unit : Persons',
'Ethnic group [E][S][W] : White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons':'White\ English/Welsh/Scottish/Northern Irish/British - Unit : Persons',
'Ethnic group [E][S][W] : White\ Irish - Unit : Persons':'White\ Irish - Unit : Persons',
'Ethnic group [E][S][W] : White\ Gypsy or Irish Traveller - Unit : Persons':'White\ Gypsy or Irish Traveller - Unit : Persons',
'Ethnic group [E][S][W] : White\ Other White - Unit : Persons':'White\ Other White - Unit : Persons',
'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons':'Mixed/multiple ethnic group\ White and Black Caribbean - Unit : Persons',
'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Black African - Unit : Persons':'Mixed/multiple ethnic group\ White and Black African - Unit : Persons',
'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ White and Asian - Unit : Persons':'Mixed/multiple ethnic group\ White and Asian - Unit : Persons',
'Ethnic group [E][S][W] : Mixed/multiple ethnic group\ Other Mixed - Unit : Persons':'Mixed/multiple ethnic group\ Other Mixed - Unit : Persons',
'Ethnic group [E][S][W] : Asian/Asian British\ Indian - Unit : Persons':'Asian/Asian British\ Indian - Unit : Persons',
'Ethnic group [E][S][W] : Asian/Asian British\ Pakistani - Unit : Persons':'Asian/Asian British\ Pakistani - Unit : Persons',
'Ethnic group [E][S][W] : Asian/Asian British\ Bangladeshi - Unit : Persons':'Asian/Asian British\ Bangladeshi - Unit : Persons',
'Ethnic group [E][S][W] : Asian/Asian British\ Chinese - Unit : Persons':'Asian/Asian British\ Chinese - Unit : Persons',
'Ethnic group [E][S][W] : Asian/Asian British\ Other Asian - Unit : Persons':'Asian/Asian British\ Other Asian - Unit : Persons',
'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ African - Unit : Persons':'Black/African/Caribbean/Black British\ African - Unit : Persons',
'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Caribbean - Unit : Persons':'Black/African/Caribbean/Black British\ Caribbean - Unit : Persons',
'Ethnic group [E][S][W] : Black/African/Caribbean/Black British\ Other Black - Unit : Persons':'Black/African/Caribbean/Black British\ Other Black - Unit : Persons',
'Ethnic group [E][S][W] : Other ethnic group\ Arab - Unit : Persons':'Number of people classified as Arab ethnic group',
'Ethnic group [E][S][W] : Other ethnic group\ Any other ethnic group - Unit : Persons':'Other ethnic group\ Any other ethnic group - Unit : Persons'}, inplace = True)
filtered_ethnic
# Normalise the same set of GEO_LABEL values as in the population dataset
# (keys are 2011 census GEO_CODEs).
_ethnic_label_fixes = {
    'E06000037': "Berkshire",
    'E06000023': "Bristol",
    'E06000049': "Cheshire",
    'E41000052': "Cornwall",
    'E07000035': "Derbyshire",
    'E07000042': "Devon",
    'E06000047': "Durham",
    'E06000019': "Herefordshire",
    'E06000010': "Kingston upon Hull",
    'E02000001': "City of London",
    'E41000324': "Westminster",
    # NOTE(review): E06000021 is Stoke-on-Trent; "Stockton-on-Trent" looks
    # like a typo. It is corrected to "Stoke-on-Trent" before the final export.
    'E06000021': "Stockton-on-Trent",
}
for _code, _label in _ethnic_label_fixes.items():
    filtered_ethnic.loc[filtered_ethnic.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Drop the metadata columns that carry no analytical signal.
filtered_ethnic = filtered_ethnic.drop(['GEO_TYPE', 'GEO_TYP2'], axis=1)
filtered_ethnic = filtered_ethnic.drop(['CDU_ID'], axis=1)
# Transform the data into a numpy ndarray
# NOTE(review): .values on the mixed frame (GEO_CODE/GEO_LABEL are strings)
# yields an object-dtype array; sklearn coerces the numeric slice below.
np_array_ethnic = filtered_ethnic.values
np_array_ethnic
# Removing the first four columns
# assumes the first four columns are non-numeric identifier columns
# (GEO_CODE, GEO_LABEL, ...) — TODO confirm against the CSV header; the
# `columns[4 + ...]` index arithmetic further down relies on this offset.
np_array_ethnic = np_array_ethnic[:, 4:]
# Building the PCA model
pca_ethnic = PCA(n_components=2).fit(np_array_ethnic) # We're telling PCA here to keep only the 2 first principal components when fitting the data
pca_x_ethnic = pca_ethnic.transform(np_array_ethnic) # This variable will contain the coordinates of each row in the new system of coordinates found by PCA (restricted to the first two directions or principal components)
# Those 2 Principal Components are going to be used to project the data onto a 2D space in order to be able
# to visualize a "compressed" version (in terms of variance) of our initial multivariate DataFrame
# Let's check out the result of our PCA applied to the dataFrame df
print("The pca_x_ethnic variable has a shape of :", pca_x_ethnic.shape)
print(pca_x_ethnic)
# Scatter the rows in the space of the first two principal components.
# Left panel of a 1x2 grid (the right panel is never filled here).
plt.subplot(1,2,1)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Projection of the original data onto the first 2 PCs")
# NOTE(review): cmap= has no effect without a c= array — all points are drawn
# in the default colour; either supply c= or drop cmap=.
plt.scatter(pca_x_ethnic[:,0], pca_x_ethnic[:,1],cmap='viridis')
# Ordering the features by their coefficients in the equation of the first Principal Component
# PC1 = a*column1 + b*column2 + ... + zzz*column101. The coefficients of PC1 a, b, c, ..zzz also called loadings, are stored in pca_clean.components_[0]
pca_x_1 = pca_ethnic.components_[0] # First principal component
sort_array_1 = np.argsort(np.abs(pca_x_1)) # Sorting the 'Indexes' of the pca_clean_1 array from the lowest value to the highest (in magnitude, ignoring the sign)
pca_x_1_highest_values = pca_x_1[sort_array_1][::-1][:10] # We sorted from low to high, [::-1] retrieves all values backward (to inverse the sort), [:10] get the first
pca_x_1_cols_highest_values = filtered_ethnic.columns[4+sort_array_1][::-1][:10] # retrieving the names of the columns corresponding to the highest values
# The '4+sort_array_1' re-aligns indices: PCA ran on np_array_ethnic[:, 4:], so
# array index i maps to frame column i+4.
# Same thing but ... a little bit more compact
pca_x_2_highest_values = np.asarray(pca_ethnic.components_[1])[np.argsort(np.abs(pca_ethnic.components_[1]))][::-1][:10]
pca_x_2_cols_highest_values = filtered_ethnic.columns[4+np.argsort(np.abs(pca_ethnic.components_[1]))][::-1][:10]
# Printing the result
print('\nFirst Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(10):
    print('The Column "{}" has a loading of : {}'.format(pca_x_1_cols_highest_values[i], pca_x_1_highest_values[i]))
print('\nSecond Principal Component : 10 Highest Loadings (in magnitude)\n')
for i in range(10):
    print('The Column "{}" has a loading of : {}'.format(pca_x_2_cols_highest_values[i], pca_x_2_highest_values[i]))
# Let's extract the numeric columns and turn them into a numpy 2d array (matrix)
ethnic_num = filtered_ethnic.select_dtypes(include='number') # selects numeric columns only
ethnic_num.head()
# Let's compute the distance matrix of pairwise euclidean distances.
# cdist computes distances between rows of the first and second argument;
# passing the same frame twice yields the square pairwise distance matrix.
dist_mat_ethnic = cdist(ethnic_num, ethnic_num)
print("The shape of this matrix is", dist_mat_ethnic.shape)
# dissimilarity='precomputed' tells MDS we supply the distance matrix ourselves
# (otherwise MDS would compute one internally).
mds_ethnic = MDS(n_components=2, random_state=123, dissimilarity='precomputed')
mds_x_ethnic = mds_ethnic.fit_transform(dist_mat_ethnic)
# Let's visualize the results
plt.figure(num=5, figsize=(25, 12))
plt.scatter(mds_x_ethnic[:, 0], mds_x_ethnic[:, 1], alpha = .3, s=100)
# Label every district with its name at its MDS coordinates.
# BUG FIX: the label was previously passed as the keyword 's=', which was
# deprecated in Matplotlib 3.3 and removed in 3.5 (renamed to 'text'); pass
# it positionally so the call works on every Matplotlib version.
for index, name in enumerate(filtered_ethnic['GEO_LABEL']):
    plt.annotate(name, xy=(mds_x_ethnic[index, 0], mds_x_ethnic[index, 1]))
plt.xlabel('Axis 1')
plt.ylabel('Axis 2')
plt.title('Multidimensional Scaling on Districts and subdivisions (All rows)');
Review:
Gender = filtered_Gender
Vehicle accessibility = filtered_vehi
Religion = filtered_reli
health = filtered_health
deprivation = filtered_dep
unpaid carers = filtered_uncar
economic activity = filtered_ea
population = filtered_pop
ethnicity = filtered_ethnic
Let's check the shape of every filtered dataframe.
# Persist every filtered dataset so later notebooks can reload them directly.
_exports = [
    (filtered_Gender, 'filtered_gender_england_dataset.csv'),
    (filtered_vehi, 'filtered_vehicle_access_england_dataset.csv'),
    (filtered_reli, 'filtered_religion_england_dataset.csv'),
    (filtered_health, 'filtered_health_england_dataset.csv'),
    (filtered_dep, 'filtered_deprivation_england_dataset.csv'),
    (filtered_uncar, 'filtered_unpaid_carers_england_dataset.csv'),
    (filtered_ea, 'filtered_econ_activity_england_dataset.csv'),
    (filtered_pop, 'filtered_population_england_dataset.csv'),
    (filtered_ethnic, 'filtered_ethnic_england_dataset.csv'),
]
for _frame, _path in _exports:
    _frame.to_csv(_path, header=True, index=False)
# Sanity-check the row/column counts of each filtered frame
# (only the last expression is displayed in a notebook cell).
filtered_Gender.shape
filtered_vehi.shape
filtered_reli.shape
filtered_health.shape
filtered_dep.shape
filtered_uncar.shape
filtered_ea.shape
filtered_pop.shape
filtered_ethnic.shape
#Data on crime commited in the city of London between 03-06(lockdown) merged
# NOTE(review): the monthly crime_* frames are defined elsewhere in the
# notebook (not visible in this section).
df_crime_all=[crime_oct2019,crime_Nov2019,crime_dec2019,crime_jan2020,crime_feb2020,crime_march2020,crime_april2020,crime_may2020,crime_june2020,crime_july2020,crime_august2020,crime_Sep2020,crime_oct2020];
# Stack all monthly crime frames vertically into one frame with a fresh index.
df_crime= pd.concat(df_crime_all , axis=0, join='outer',ignore_index=True, keys=None,
levels=None, names=None, verify_integrity=False, copy=True)
# NOTE(review): df_all is used here but the only visible construction of it
# (the list of filtered_* frames) is commented out further down — confirm it
# is defined in an earlier cell, otherwise these two lines fail at runtime.
df_all123 = pd.concat(df_all,axis=0,join='outer',ignore_index=True,keys=None,levels=None,names=None,verify_integrity=True,copy=False)
df_test=pd.concat(df_all,axis=1)
# Combine all nine filtered datasets into one wide frame by chaining pd.merge
# calls. With no 'on'/'how' arguments, pd.merge inner-joins on all columns the
# two frames share (here the GEO_* identifier columns).
df_genvehi=pd.merge(filtered_Gender,filtered_vehi)
#df_all=[filtered_Gender,filtered_vehi,filtered_reli,filtered_health,filtered_dep,filtered_uncar,filtered_ea,filtered_pop,filtered_ethnic];
df_genvehi.head()
df_3=pd.merge(df_genvehi,filtered_reli)
df_3.head()
df_4=pd.merge(df_3,filtered_health)
df_4.head()
df_5=pd.merge(df_4,filtered_dep)
df_5.head()
df_6=pd.merge(df_5,filtered_ea)
df_6.head()
df_7=pd.merge(df_6,filtered_pop)
df_7.head()
# Note: unpaid carers (filtered_uncar) joins later than the list order above.
df_8=pd.merge(df_7,filtered_uncar)
df_8.head()
df_9=pd.merge(df_8,filtered_ethnic)
df_9
# Spot-check a single area (E02000001 = City of London) in the merged frame.
Key = df_9[df_9['GEO_CODE']=='E02000001']
Key
# Confirm the merged frame has no missing values before exporting.
df_9.isnull().values.any()
#last two changes needed to be made
# Final GEO_LABEL corrections (restores the canonical Stoke-on-Trent spelling).
_final_labels = [
    ('E06000055', "Bedfordshire"),
    ('E02000001', "London"),
    ('E06000021', "Stoke-on-Trent"),
]
for _code, _label in _final_labels:
    df_9.loc[df_9.GEO_CODE == _code, 'GEO_LABEL'] = _label
# Export the combined dataset and read it back to confirm it round-trips.
df_9.to_csv('Combined_England_dataset_Engineered.csv', header=True, index=False)
Load = pd.read_csv('Combined_England_dataset_Engineered.csv')
Load